{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e0b6a187-f5a6-49ea-ac44-fa74d39d2717",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /Users/claudiac/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/claudiac/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import nltk\n",
    "import string\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from collections import defaultdict\n",
    "from nltk.corpus import stopwords\n",
    "import matplotlib.pyplot as plt\n",
    "from collections import Counter\n",
    "\n",
    "import ssl\n",
    "\n",
    "# NOTE(review): this swaps ssl's default HTTPS context factory for the unverified one,\n",
    "# which turns off certificate verification for code that relies on that default.\n",
    "# Presumably a workaround for nltk.download() certificate errors -- confirm it is still needed.\n",
    "try:\n",
    "    _create_unverified_https_context = ssl._create_unverified_context\n",
    "except AttributeError:\n",
    "    # ssl._create_unverified_context is missing in this Python; leave the default in place.\n",
    "    pass\n",
    "else:\n",
    "    ssl._create_default_https_context = _create_unverified_https_context\n",
    "\n",
    "# Fetch the 'punkt' and 'stopwords' NLTK packages (reported as already up to date in the saved output).\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e203e16d-9a30-489f-80cc-5e10c0100fd7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the corpus; later cells group it by the 'model' and 'author' columns and read 'text'.\n",
    "df = pd.read_csv(\"data/whole_corpus.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a867be8d-9916-454f-978b-50ed3a52a5b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per model: all of that model's texts joined into a single space-separated string\n",
    "\n",
    "grouped_by_category = (\n",
    "    df.groupby(['model'])['text']\n",
    "    .apply(' '.join)\n",
    "    .reset_index()\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8a03b40e-9863-4e1c-a32d-a50dfd122b77",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per (model, author) pair: that pair's texts joined into a single space-separated string\n",
    "\n",
    "grouped = (\n",
    "    df.groupby(['model', 'author'])['text']\n",
    "    .apply(' '.join)\n",
    "    .reset_index()\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "63a9c113-3259-4f2d-a4c3-0d6270acd46f",
   "metadata": {},
   "source": [
    "## Sentence Lengths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c25604fb-3949-4b95-966d-c5ffc2c8b0af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokens treated as punctuation: string.punctuation plus three quote tokens\n",
    "punc_list = list(string.punctuation) + ['``', '\"', \"''\"]\n",
    "\n",
    "# Function to calculate average sentence length\n",
    "def sentence_length_stats(text, exclude_punctuation=False):\n",
    "    \"\"\"Return the mean and sample standard deviation of sentence lengths in ``text``.\n",
    "\n",
    "    Sentences come from ``sent_tokenize``; a sentence's length is the number of\n",
    "    tokens ``word_tokenize`` yields for it.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        Text to analyse.\n",
    "    exclude_punctuation : bool, default False\n",
    "        When True, tokens listed in ``punc_list`` are not counted. The default\n",
    "        keeps the earlier behaviour, where punctuation tokens are counted.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(mean_len, std_dev_len)``. The mean is 0 when no sentence is found and\n",
    "        the standard deviation is 0 when fewer than two sentences are found.\n",
    "    \"\"\"\n",
    "    sentence_lengths = []\n",
    "    for sentence in sent_tokenize(text):\n",
    "        tokens = word_tokenize(sentence)\n",
    "        if exclude_punctuation:\n",
    "            tokens = [token for token in tokens if token not in punc_list]\n",
    "        sentence_lengths.append(len(tokens))\n",
    "    mean_len = np.mean(sentence_lengths) if sentence_lengths else 0\n",
    "    # ddof=1 gives the sample standard deviation, which needs at least two sentences\n",
    "    std_dev_len = np.std(sentence_lengths, ddof=1) if len(sentence_lengths) > 1 else 0\n",
    "    return mean_len, std_dev_len\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3013a60a-ce58-4bc8-9038-0e53a458dda7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculating average sentence lengths and standard deviations for models overall and saving to new columns\n",
    "grouped_by_category[['avg_sentence_length', 'std_sentence_length']] = grouped_by_category['text'].apply(lambda x: pd.Series(sentence_length_stats(x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "9f560cf0-c677-4997-b21d-0f55d47e635d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculating average sentence lengths and standard deviations for models per author and saving to new columns\n",
    "grouped[['avg_sentence_length', 'std_sentence_length']] = grouped['text'].apply(lambda x: pd.Series(sentence_length_stats(x)))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6f51631f-c301-4f11-843d-aea8801d40ca",
   "metadata": {},
   "source": [
    "## Gendered Pronouns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "a117de4a-fb42-480b-b281-ed02a0c9bd0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create function to calculate male pronouns\n",
    "\n",
    "def male_pronouns(text):\n",
    "    \"\"\"Return the share of alphabetic tokens in ``text`` that are male pronouns.\n",
    "\n",
    "    Tokens come from ``word_tokenize``; only ``str.isalpha`` tokens are kept and\n",
    "    they are lowercased before being matched against him / his / he / himself.\n",
    "    Returns 0.0 when the text has no alphabetic tokens.\n",
    "    \"\"\"\n",
    "    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]\n",
    "    if not tokens:\n",
    "        # Nothing to divide by; report a zero rate instead of raising ZeroDivisionError\n",
    "        return 0.0\n",
    "    male_pronouns_set = {\"him\", \"his\", \"he\", \"himself\"}\n",
    "    # Single pass over the tokens instead of one list.count() scan per pronoun\n",
    "    male_count = sum(token in male_pronouns_set for token in tokens)\n",
    "    return male_count / len(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "a92bbe25-fc00-4f5d-a68a-8f054102e96d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculating male pronoun rate for models overall\n",
    "grouped_by_category['male_pronoun_rate'] = grouped_by_category['text'].apply(male_pronouns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "f87c162f-8659-461b-ad34-06cac2d84904",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculating male pronoun rate by model and author\n",
    "grouped['male_pronoun_rate'] = grouped['text'].apply(male_pronouns)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2e835adb-bbd8-4825-a067-78bc31803202",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create function to calculate female pronouns\n",
    "\n",
    "def female_pronouns(text):\n",
    "    \"\"\"Return the share of alphabetic tokens in ``text`` that are female pronouns.\n",
    "\n",
    "    Tokens come from ``word_tokenize``; only ``str.isalpha`` tokens are kept and\n",
    "    they are lowercased before being matched against her / hers / she / herself.\n",
    "    Returns 0.0 when the text has no alphabetic tokens.\n",
    "    \"\"\"\n",
    "    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]\n",
    "    if not tokens:\n",
    "        # Nothing to divide by; report a zero rate instead of raising ZeroDivisionError\n",
    "        return 0.0\n",
    "    female_pronouns_set = {\"her\", \"hers\", \"she\", \"herself\"}\n",
    "    # Single pass over the tokens instead of one list.count() scan per pronoun\n",
    "    female_count = sum(token in female_pronouns_set for token in tokens)\n",
    "    return female_count / len(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9b8ff9b7-8f1c-4cf9-b5c7-26c8e851024f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculating female pronoun rate for models overall\n",
    "grouped_by_category['female_pronoun_rate'] = grouped_by_category['text'].apply(female_pronouns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ba4d6f03-6381-44cb-b306-a6348327059b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculating female pronoun rate by model and author\n",
    "grouped['female_pronoun_rate'] = grouped['text'].apply(female_pronouns)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eb1c61cb-bf79-4b4d-9eea-ede6150a7b59",
   "metadata": {},
   "source": [
    "## Moving Average TTR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "089468fb-e724-41d1-82f0-322d62f68dd7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create function to calculate moving average type-token ratio\n",
    "\n",
    "def calculate_mattr(text, window_size=150):\n",
    "    \"\"\"Return the moving-average type-token ratio (MATTR) of ``text``.\n",
    "\n",
    "    The text is tokenized with ``word_tokenize``; only alphabetic tokens are\n",
    "    kept, lowercased. For every run of ``window_size`` consecutive tokens the\n",
    "    ratio distinct-tokens / window_size is computed, and the mean is returned.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        Text to analyse.\n",
    "    window_size : int, default 150\n",
    "        Number of tokens per window; must be at least 1.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Mean windowed type-token ratio. 0 when the text has no alphabetic\n",
    "        tokens. When the text is shorter than one window, the plain type-token\n",
    "        ratio of the whole text is returned instead of NaN.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``window_size`` is smaller than 1.\n",
    "    \"\"\"\n",
    "    if window_size < 1:\n",
    "        raise ValueError(\"window_size must be a positive integer\")\n",
    "    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]\n",
    "    total_tokens = len(tokens)\n",
    "    if total_tokens == 0:\n",
    "        return 0\n",
    "    if total_tokens < window_size:\n",
    "        # No complete window fits; averaging an empty list would give NaN\n",
    "        return len(set(tokens)) / total_tokens\n",
    "    # Slide the window one token at a time, keeping per-token counts so the\n",
    "    # number of distinct tokens is updated without rebuilding a set each step\n",
    "    window_counts = Counter(tokens[:window_size])\n",
    "    mattr_values = [len(window_counts) / window_size]\n",
    "    for start in range(1, total_tokens - window_size + 1):\n",
    "        leaving = tokens[start - 1]\n",
    "        window_counts[leaving] -= 1\n",
    "        if window_counts[leaving] == 0:\n",
    "            del window_counts[leaving]  # drop it so len() counts only tokens still in the window\n",
    "        window_counts[tokens[start + window_size - 1]] += 1\n",
    "        mattr_values.append(len(window_counts) / window_size)\n",
    "    return np.mean(mattr_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "4687f788-8e10-487c-846c-396025017a1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate MATTR for models overall\n",
    "grouped_by_category['MATTR'] = grouped_by_category['text'].apply(lambda x: calculate_mattr(x, window_size=150))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "5077a6ea-7582-4df9-94ac-bb362a1ada72",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Calculate MATTR by model and author\n",
    "grouped['MATTR'] = grouped['text'].apply(lambda x: calculate_mattr(x, window_size=150))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b4eba77-f92c-4e5d-9fe4-005be900d7e2",
   "metadata": {},
   "source": [
    "## Content Words ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "e8a6ccad-1998-4f88-9706-d1a07b88a1b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# NLTK's English stop-word list, held as a set for membership tests\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "# Lowercase and tokenize a text, keeping only its content words\n",
    "def preprocess_and_tokenize(text):\n",
    "    \"\"\"Return the lowercase alphabetic tokens of ``text`` that are not stop words.\"\"\"\n",
    "    kept = []\n",
    "    for token in nltk.word_tokenize(text.lower()):\n",
    "        if not token.isalpha():\n",
    "            continue\n",
    "        if token in stop_words:\n",
    "            continue\n",
    "        kept.append(token)\n",
    "    return kept\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "1848df99-5027-4575-b1fb-b20bd0e64ff9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize each text for models overall\n",
    "\n",
    "grouped_by_category['tokens'] = grouped_by_category['text'].apply(preprocess_and_tokenize)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "7dc64bc7-9494-4303-ba41-41f144eebc2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "281f7c82-2233-4b6f-9e53-bcf0c78c5319",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Content-word relative frequencies per model: one column holds the full\n",
    "# word -> frequency mapping, the next only the 20 most frequent words\n",
    "\n",
    "relative_frequencies = []\n",
    "top_20_relative_frequencies = []\n",
    "\n",
    "for model_tokens in grouped_by_category[\"tokens\"]:\n",
    "    n_tokens = len(model_tokens)\n",
    "    counts = Counter(model_tokens)\n",
    "    relative_frequencies.append(\n",
    "        {word: count / n_tokens for word, count in counts.items()}\n",
    "    )\n",
    "    # most_common(20) orders by count, highest first, keeping first-seen order among ties\n",
    "    top_20_relative_frequencies.append(\n",
    "        {word: count / n_tokens for word, count in counts.most_common(20)}\n",
    "    )\n",
    "\n",
    "grouped_by_category[\"relative_frequencies\"] = relative_frequencies\n",
    "grouped_by_category[\"top_20_relative_frequencies\"] = top_20_relative_frequencies\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "09fb0e89-1c4f-4f68-8058-cf3c07a154b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Content-word relative frequencies and top 20 content words per model and author\n",
    "\n",
    "grouped['tokens'] = grouped['text'].apply(preprocess_and_tokenize)\n",
    "\n",
    "relative_frequencies_list = []\n",
    "top_20_relative_frequencies = []\n",
    "\n",
    "for author_tokens in grouped[\"tokens\"]:\n",
    "    n_tokens = len(author_tokens)\n",
    "    counts = Counter(author_tokens)\n",
    "    relative_frequencies_list.append(\n",
    "        {word: count / n_tokens for word, count in counts.items()}\n",
    "    )\n",
    "    # most_common(20) orders by count, highest first, keeping first-seen order among ties\n",
    "    top_20_relative_frequencies.append(\n",
    "        {word: count / n_tokens for word, count in counts.most_common(20)}\n",
    "    )\n",
    "\n",
    "grouped[\"relative_frequencies\"] = relative_frequencies_list\n",
    "grouped[\"top_20_relative_frequencies\"] = top_20_relative_frequencies\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "81852964-ac98-40c5-a25a-2d37a22dd8bf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model</th>\n",
       "      <th>text</th>\n",
       "      <th>avg_sentence_length</th>\n",
       "      <th>std_sentence_length</th>\n",
       "      <th>male_pronoun_rate</th>\n",
       "      <th>female_pronoun_rate</th>\n",
       "      <th>MATTR</th>\n",
       "      <th>tokens</th>\n",
       "      <th>relative_frequencies</th>\n",
       "      <th>top_20_relative_frequencies</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>authentic</td>\n",
       "      <td>_Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...</td>\n",
       "      <td>26.288286</td>\n",
       "      <td>22.128578</td>\n",
       "      <td>0.029714</td>\n",
       "      <td>0.020081</td>\n",
       "      <td>0.658885</td>\n",
       "      <td>[jean, muir, come, mamma, yet, wish, well, tho...</td>\n",
       "      <td>{'jean': 5.3210539779994885e-05, 'muir': 3.661...</td>\n",
       "      <td>{'said': 0.010786049287967937, 'would': 0.0072...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>gpt3.5</td>\n",
       "      <td>In a quaint little town nestled amidst the rol...</td>\n",
       "      <td>30.043777</td>\n",
       "      <td>8.247714</td>\n",
       "      <td>0.015494</td>\n",
       "      <td>0.021028</td>\n",
       "      <td>0.678479</td>\n",
       "      <td>[quaint, little, town, nestled, amidst, rollin...</td>\n",
       "      <td>{'quaint': 0.001599350051362106, 'little': 0.0...</td>\n",
       "      <td>{'like': 0.009355772441412107, 'shadows': 0.00...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>gpt4</td>\n",
       "      <td>Miss Clara Wharton, cloaked in the quiet elega...</td>\n",
       "      <td>32.635082</td>\n",
       "      <td>10.529069</td>\n",
       "      <td>0.012923</td>\n",
       "      <td>0.021179</td>\n",
       "      <td>0.705278</td>\n",
       "      <td>[miss, clara, wharton, cloaked, quiet, eleganc...</td>\n",
       "      <td>{'miss': 0.0032572487988895052, 'clara': 0.000...</td>\n",
       "      <td>{'yet': 0.006826966232556202, 'heart': 0.00551...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       model                                               text  \\\n",
       "0  authentic  _Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...   \n",
       "1     gpt3.5  In a quaint little town nestled amidst the rol...   \n",
       "2       gpt4  Miss Clara Wharton, cloaked in the quiet elega...   \n",
       "\n",
       "   avg_sentence_length  std_sentence_length  male_pronoun_rate  \\\n",
       "0            26.288286            22.128578           0.029714   \n",
       "1            30.043777             8.247714           0.015494   \n",
       "2            32.635082            10.529069           0.012923   \n",
       "\n",
       "   female_pronoun_rate     MATTR  \\\n",
       "0             0.020081  0.658885   \n",
       "1             0.021028  0.678479   \n",
       "2             0.021179  0.705278   \n",
       "\n",
       "                                              tokens  \\\n",
       "0  [jean, muir, come, mamma, yet, wish, well, tho...   \n",
       "1  [quaint, little, town, nestled, amidst, rollin...   \n",
       "2  [miss, clara, wharton, cloaked, quiet, eleganc...   \n",
       "\n",
       "                                relative_frequencies  \\\n",
       "0  {'jean': 5.3210539779994885e-05, 'muir': 3.661...   \n",
       "1  {'quaint': 0.001599350051362106, 'little': 0.0...   \n",
       "2  {'miss': 0.0032572487988895052, 'clara': 0.000...   \n",
       "\n",
       "                         top_20_relative_frequencies  \n",
       "0  {'said': 0.010786049287967937, 'would': 0.0072...  \n",
       "1  {'like': 0.009355772441412107, 'shadows': 0.00...  \n",
       "2  {'yet': 0.006826966232556202, 'heart': 0.00551...  "
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped_by_category.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "fb550e33-51da-40dd-8854-8d484947c0d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped.to_csv(\"data/HDSR_summary_stats_final.csv\")\n",
    "grouped_by_category.to_csv(\"data/HDSR_overall_summary_stats_final.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a99bce2-511d-487e-badf-27100bf5d813",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
